started: AL13Jul2019
last updated: AL01ct2019
Eigenvectors calculated from 622 variants (non-rare, not in LD)
for 3,019 samples = 2,504 kgen + 515 wecare-nfe (258UBC, 257CBC)
Suggest 34 clear ethnic outliers using manually selected thresholds
(previously, using a smaller number of variants in ampliseq-nfe analysis,
there was 14 outliers more; it is possible to strignten thretholds though …)
Sys.time()
## [1] "2019-10-01 20:10:19 BST"
rm(list=ls())
graphics.off()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s06_explore_joined_PCA_plots"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F)
options(warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588
# Sequencing data
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s01_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s06_explore_joined_PCA_plots"
# Eigenvectors and eigenvalues
source_folder="/Users/alexey/Documents/wecare/ampliseq/v05_ampliseq_only/s12_joined_ampliseq_1kgp_PCA/s05_calculate_joined_1kgp_ampliseq_PCs/data/s04_pca"
eigenvectors_file <- paste(source_folder, "ampliseq_1kg_622_3019_100PCs.eigenvec", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues_file <- paste(source_folder, "ampliseq_1kg_622_3019_100PCs.eigenval", sep="/")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")
# IKGP phenotypes
source_folder="/Users/alexey/Documents/resources/1kgp"
kg_phenotypes_file <- paste(source_folder, "integrated_call_samples_v3.20130502.ALL.panel", sep="/")
kg_phenotypes.df <- read.table(kg_phenotypes_file, header=T)
# Thresholds to select outliers
# (manually selected on the basis of visual assessment of plots)
pc1_th <- 0.006
pc2_th <- 0.009
# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file, kg_phenotypes_file, genotypes.mx, variants.df)
ls()
## [1] "base_folder" "eigenvalues.df" "eigenvectors.df" "kg_phenotypes.df" "pc1_th" "pc2_th" "phenotypes.df"
dim(eigenvectors.df)
## [1] 3019 102
dim(eigenvalues.df)
## [1] 100 1
dim(kg_phenotypes.df)
## [1] 2504 4
dim(phenotypes.df)
## [1] 515 24
table(phenotypes.df$cc)
##
## 0 1
## 258 257
eigenvectors.df[1:5,1:5]
## FID IID PC1 PC2 PC3
## 1 HG00096 HG00096 -0.0123803 -0.02318330 -0.01229230
## 2 HG00097 HG00097 -0.0116563 -0.01401060 0.00149094
## 3 HG00099 HG00099 -0.0101549 -0.00926998 -0.00632705
## 4 HG00100 HG00100 -0.0105016 -0.02286040 -0.00877107
## 5 HG00101 HG00101 -0.0113586 -0.01951960 0.00280684
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"sample" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
## sample PC1 PC2 PC3 PC4
## HG00096 HG00096 -0.0123803 -0.02318330 -0.01229230 0.01548540
## HG00097 HG00097 -0.0116563 -0.01401060 0.00149094 0.01210660
## HG00099 HG00099 -0.0101549 -0.00926998 -0.00632705 -0.01006840
## HG00100 HG00100 -0.0105016 -0.02286040 -0.00877107 0.00864473
## HG00101 HG00101 -0.0113586 -0.01951960 0.00280684 0.01152710
plot(eigenvalues.df$V1, type="b", ylab="Variance",
main="Top 100 eigenvectors")
plot(eigenvalues.df$V1[1:10], type="b", ylab="Variance",
main="Top 10 eigenvectors")
rm(eigenvalues.df)
Expected order of samples:
# Check the expected order of samples
eigenvectors.df[c(2504,2505),c("sample","PC1")]
## sample PC1
## NA21144 NA21144 -0.00761811
## 100_S8_L007 100_S8_L007 -0.01287310
# Prepare phenotypes for ampliseq-kgen data
phenotypes_ampliseq.df <- phenotypes.df[1:515,c("long_ids","cc")]
table(phenotypes_ampliseq.df$cc)
##
## 0 1
## 258 257
"UBC" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==0]
"CBC" -> phenotypes_ampliseq.df$cc[phenotypes_ampliseq.df$cc==1]
table(phenotypes_ampliseq.df$cc)
##
## CBC UBC
## 257 258
c("sample","group") -> colnames(phenotypes_ampliseq.df)
phenotypes_kgen.df <- kg_phenotypes.df[,c("sample","super_pop")]
c("sample","group") -> colnames(phenotypes_kgen.df)
phenotypes_ampliseq_kgen.df <- rbind(phenotypes_kgen.df,phenotypes_ampliseq.df)
table(phenotypes_ampliseq_kgen.df$group)
##
## AFR AMR CBC EAS EUR SAS UBC
## 661 347 257 504 503 489 258
# Add eigenvectors to phenotypes
eigenphen_ampliseq_kgen.df <- full_join(
phenotypes_ampliseq_kgen.df, eigenvectors.df[,1:6], by="sample")
dim(eigenphen_ampliseq_kgen.df)
## [1] 3019 7
head(eigenphen_ampliseq_kgen.df)
## sample group PC1 PC2 PC3 PC4 PC5
## 1 HG00096 EUR -0.0123803 -0.02318330 -0.012292300 0.01548540 0.01744860
## 2 HG00097 EUR -0.0116563 -0.01401060 0.001490940 0.01210660 0.02883240
## 3 HG00099 EUR -0.0101549 -0.00926998 -0.006327050 -0.01006840 0.02283990
## 4 HG00100 EUR -0.0105016 -0.02286040 -0.008771070 0.00864473 0.00241337
## 5 HG00101 EUR -0.0113586 -0.01951960 0.002806840 0.01152710 -0.00587478
## 6 HG00102 EUR -0.0104850 -0.01412700 0.000701292 0.01694920 0.00885609
tail(eigenphen_ampliseq_kgen.df)
## sample group PC1 PC2 PC3 PC4 PC5
## 3014 95_S517_L008 CBC -0.01218660 -0.0047522 -0.00531965 0.01171110 0.000949141
## 3015 96_S236_L007 CBC -0.00991351 -0.0131943 -0.00128169 0.00433099 0.002379520
## 3016 97_S509_L008 UBC -0.01258500 -0.0143450 -0.01745760 0.00446267 -0.004143000
## 3017 98_S335_L008 UBC -0.01185330 -0.0136037 -0.00539617 0.00608450 0.008774560
## 3018 99_S418_L008 CBC -0.01344640 -0.0149487 -0.01010680 0.00732798 -0.004143990
## 3019 9_S346_L008 CBC -0.01171660 -0.0173992 0.01152830 0.02320550 0.016020500
# Clean-up
rm(kg_phenotypes.df, phenotypes_ampliseq.df, phenotypes_ampliseq_kgen.df,
phenotypes.df, phenotypes_kgen.df, eigenvectors.df)
# Prepare vector fr colour scale
myColours <- c("EUR"="BLUE", "AFR"="YELLOW", "AMR"="GREEN",
"SAS"="GREY", "EAS"="AQUAMARINE",
"UBC"="SALMON", "CBC"="RED")
myColourScale <- scale_colour_manual(values=myColours)
# Static plot
ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) +
geom_point(aes(col=group)) +
labs(title="WECARE Ampliseq PCA",
subtitle="based on 622 non-rare variants not in LD", x="PC1", y="PC2") +
geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
myColourScale
# Interactive plot
plotly_group <- factor(eigenphen_ampliseq_kgen.df$group,
levels=c("AFR","AMR","EAS","SAS","EUR","UBC","CBC"))
g <- ggplot(eigenphen_ampliseq_kgen.df, aes(PC1,PC2)) +
geom_point(aes(col=plotly_group, text=sample)) +
labs(title="WECARE Ampliseq PCA", x="PC1", y="PC2") +
theme(legend.title=element_blank()) + # To suppress the legend title, otherwise it would be "plotly_group
geom_vline(xintercept=pc1_th, linetype="dashed", size=0.5) +
geom_hline(yintercept=pc2_th, linetype="dashed", size=0.5) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(myColours, myColourScale, plotly_group, g)
# Select ampliseq IDs
eigenphen_ampliseq_kgen.df$sample[2504:2505]
## [1] "NA21144" "100_S8_L007"
ampliseq_ids <- eigenphen_ampliseq_kgen.df$sample[2505:3019]
# Select outliers
pc1_outliers <- eigenphen_ampliseq_kgen.df$PC1 > pc1_th & eigenphen_ampliseq_kgen.df$sample %in% ampliseq_ids
pc2_outliers <- eigenphen_ampliseq_kgen.df$PC2 > pc2_th & eigenphen_ampliseq_kgen.df$sample %in% ampliseq_ids
pca_outliers <- pc1_outliers | pc2_outliers
sum(pc1_outliers)
## [1] 20
sum(pc2_outliers)
## [1] 14
sum(pca_outliers)
## [1] 34
joined_outliers <- eigenphen_ampliseq_kgen.df[pca_outliers,"sample"]
joined_outliers
## [1] "133_S168_L007" "141_S158_L007" "148_S432_L008" "16_S109_L007" "235_S535_L008" "238_S520_L008" "246_S375_L008" "256_S513_L008" "267_S48_L007" "273_S150_L007" "275_S22_L007" "277_S292_L008" "285_S374_L008" "289_S69_L007" "293_S479_L008" "308_S434_L008" "313_S362_L008" "323_S469_L008" "326_S317_L008" "329_S373_L008" "330_S409_L008" "347_S36_L007" "352_S435_L008" "355_S365_L008" "366_S293_L008" "368_S46_L007" "369_S230_L007" "372_S340_L008" "375_S140_L007" "385_S305_L008" "388_S259_L007" "3_S360_L008" "403_S210_L007" "408_S130_L007"
# Compare with previous list of 48 outliers
previous_outliers <- c("103_S147_L007","11_S358_L008","133_S168_L007","139_S123_L007","141_S158_L007","146_S408_L008","148_S432_L008","16_S109_L007","1_S441_L008","213_S257_L007","235_S535_L008","238_S520_L008","246_S375_L008","248_S188_L007","256_S513_L008","267_S48_L007","273_S150_L007","275_S22_L007","276_S209_L007","277_S292_L008","285_S374_L008","287_S10_L007","289_S69_L007","293_S479_L008","308_S434_L008","313_S362_L008","323_S469_L008","326_S317_L008","329_S373_L008","330_S409_L008","347_S36_L007","352_S435_L008","355_S365_L008","35_S9_L007","366_S293_L008","368_S46_L007","369_S230_L007","372_S340_L008","375_S140_L007","376_S84_L007","381_S277_L008","385_S305_L008","388_S259_L007","3_S360_L008","403_S210_L007","407_S106_L007","408_S130_L007","94_S143_L007")
x <- setdiff(previous_outliers,joined_outliers)
length(x)
## [1] 14
x
## [1] "103_S147_L007" "11_S358_L008" "139_S123_L007" "146_S408_L008" "1_S441_L008" "213_S257_L007" "248_S188_L007" "276_S209_L007" "287_S10_L007" "35_S9_L007" "376_S84_L007" "381_S277_L008" "407_S106_L007" "94_S143_L007"
y <- setdiff(joined_outliers,previous_outliers)
length(y)
## [1] 0
y
## character(0)
z <- intersect(previous_outliers,joined_outliers)
length(z)
## [1] 34
z
## [1] "133_S168_L007" "141_S158_L007" "148_S432_L008" "16_S109_L007" "235_S535_L008" "238_S520_L008" "246_S375_L008" "256_S513_L008" "267_S48_L007" "273_S150_L007" "275_S22_L007" "277_S292_L008" "285_S374_L008" "289_S69_L007" "293_S479_L008" "308_S434_L008" "313_S362_L008" "323_S469_L008" "326_S317_L008" "329_S373_L008" "330_S409_L008" "347_S36_L007" "352_S435_L008" "355_S365_L008" "366_S293_L008" "368_S46_L007" "369_S230_L007" "372_S340_L008" "375_S140_L007" "385_S305_L008" "388_S259_L007" "3_S360_L008" "403_S210_L007" "408_S130_L007"
# Clean-up
rm(ampliseq_ids, pc1_outliers, pc2_outliers, pca_outliers, previous_outliers, x,y,z)
save.image(paste(base_folder, "s01_joined_PCA_plots_622_3019_not_rare_not_in_LD.RData", sep="/"))
ls()
## [1] "base_folder" "eigenphen_ampliseq_kgen.df" "joined_outliers" "pc1_th" "pc2_th"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.0 ggplot2_3.2.0 dplyr_0.8.1 knitr_1.23
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 later_0.8.0 pillar_1.4.1 compiler_3.5.1 tools_3.5.1 digest_0.6.19 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.2 rlang_0.3.4 shiny_1.3.2 crosstalk_1.0.0 yaml_2.2.0 xfun_0.7 withr_2.1.2 stringr_1.4.0 httr_1.4.0 htmlwidgets_1.3 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 purrr_0.3.2 tidyr_0.8.3 magrittr_1.5 promises_1.0.1 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 xtable_1.8-4 mime_0.7 colorspace_1.4-1 httpuv_1.5.1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
Sys.time()
## [1] "2019-10-01 20:10:22 BST"